# -*- coding: utf-8 -*-
 
""" 数据预处理 """
import re
import pandas as pd
# Lift pandas' display limits so printed DataFrames are never truncated:
# None means "no limit" for both the row count and the column count.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Load the raw listings and discard the first column by position.
# NOTE(review): the first column is presumably a saved index — confirm
# against the CSV before relying on this.
df = pd.read_csv("2020年3月30日佛山二手房.csv").iloc[:, 1:]
# print(df.head(1))

def parse_info_math(row):
    """
    Keep only the digit characters of a string.

    This is used element-wise through ``Series.apply``, so ``row`` is a
    single cell value, not a whole pandas Series.

    :param row: the string to filter
    :return: ``row`` with every non-digit character removed; an empty
        string when ``row`` contains no digits at all
    :raises TypeError: if ``row`` is not a string
    """
    # Raw string on purpose: a plain '\D' literal is an invalid escape
    # sequence, which newer Python versions warn about (SyntaxWarning since
    # 3.12) and will eventually reject.
    return re.sub(r'\D', '', row)
def parse_info_size(row):
    """
    Remove every occurrence of the unit text '平米' from a string.

    :param row: the string to clean (a single cell value)
    :return: ``row`` without any '平米' substrings
    """
    unit_pattern = re.compile('平米')
    without_unit = unit_pattern.sub('', row)
    return without_unit
def parse_info_direciton(row):
    """
    Return the first element of ``row`` (its first character for a string).

    :param row: an indexable value, a single cell value when used with
        ``Series.apply``
    :return: ``row[0]``
    :raises IndexError: if ``row`` is empty
    """
    leading = row[0]
    return leading

def parse_info_house_type(row):
    """
    Pick two positional fields out of a house-type string.

    The element at index 0 is reported as ``beds`` and the element at
    index 2 as ``rooms``.

    :param row: an indexable value with at least three elements
        (presumably a string such as '3室2厅' — confirm against the data)
    :return: a pandas Series with the labels ``beds`` and ``rooms``
    :raises IndexError: if ``row`` has fewer than three elements
    """
    # NOTE(review): fixed positions only work for single-character counts.
    return pd.Series({'beds': row[0], "rooms": row[2]})

# Clean the scalar columns one by one; each parser receives a single cell
# value and the result overwrites the column it came from.
for _column, _parser in (
    ("unitPrice", parse_info_math),
    ("size", parse_info_size),
    ("direction", parse_info_direciton),
    ("position", parse_info_math),
):
    df[_column] = df[_column].apply(_parser)

# Expand house_type into separate 'beds' and 'rooms' columns, then drop the
# columns that are no longer needed: the title and the raw house_type.
beds_and_rooms = df["house_type"].apply(parse_info_house_type)
df = df.join(beds_and_rooms).drop(columns=["title", "house_type"])

# Write the cleaned table without the DataFrame index.
df.to_csv("ml.csv", index=False)
